library(h2o)
# Connect to an H2O cluster before any frames are created.
h2o.init()

# Locate the Craigslist job titles CSV and import it as the "jobtitles" frame:
# the first column is a categorical label, the second the raw title text.
job.titles.path <- h2o:::.h2o.locate("smalldata/craigslistJobTitles.csv")
job.titles <- h2o.importFile(
    job.titles.path,
    destination_frame = "jobtitles",
    col.names = c("category", "jobtitle"),
    col.types = c("Enum", "String"),
    header = TRUE
)

# Words dropped during tokenization; used as the default `stop.words`
# argument of tokenize(). Repeated entries are kept exactly as listed.
STOP_WORDS <- c(
    "ax", "i", "you", "edu", "s", "t", "m", "subject", "can",
    "lines", "re", "what", "there", "all", "we", "one", "the", "a",
    "an", "of", "or", "in", "for", "by", "on", "but", "is",
    "in", "a", "not", "with", "as", "was", "if", "they", "are",
    "this", "and", "it", "have", "from", "at", "my", "be", "by",
    "not", "that", "to", "from", "com", "org", "like", "likes", "so"
)

# Split sentences into a cleaned sequence of lower-case words.
#
# Args:
#   sentences:  H2O string column holding the text to tokenize.
#   stop.words: character vector of words to discard; defaults to STOP_WORDS.
#
# Returns:
#   The token frame after dropping one-character tokens, tokens that contain
#   a digit, and stop words. Rows that are NA are explicitly kept by the
#   length and stop-word filters (presumably they delimit sentences for
#   word2vec -- confirm against the h2o.tokenize documentation).
tokenize <- function(sentences, stop.words = STOP_WORDS) {
    # NOTE(review): the doubled escaping looks intentional -- the pattern
    # appears to be unescaped once more on the H2O side, yielding \W+.
    tokenized <- h2o.tokenize(sentences, "\\\\W+")

    # convert to lower case
    tokenized.lower <- h2o.tolower(tokenized)
    # remove short words (less than 2 characters)
    # `||` is applied to H2O frames here and is used as a row mask
    # (the h2o package supplies its own `||` for frames).
    tokenized.lengths <- h2o.nchar(tokenized.lower)
    tokenized.filtered <- tokenized.lower[is.na(tokenized.lengths) || tokenized.lengths >= 2,]
    # remove words that contain numbers
    tokenized.words <- tokenized.filtered[h2o.grep("[0-9]", tokenized.filtered, invert = TRUE, output.logical = TRUE),]

    # remove stop words -- honour the `stop.words` argument rather than
    # always reading the global STOP_WORDS, so callers can pass their own list
    tokenized.words[is.na(tokenized.words) || (! tokenized.words %in% stop.words),]
}

# Predict the category of a single free-text job title.
# Named `.predict` because `predict` conflicts with the generic fn defined in R.stats.
#
# Args:
#   job.title: the job title text to classify.
#   w2v:       word2vec model used to turn the title's words into a vector.
#   gbm:       model that h2o.predict() applies to that vector.
.predict <- function(job.title, w2v, gbm) {
    # Upload the title to H2O as a string column, then tokenize it the same
    # way as the training data.
    title.frame <- as.character(as.h2o(job.title))
    title.words <- tokenize(title.frame)

    # Average the word vectors into one vector for the whole title.
    title.vec <- h2o.transform(w2v, title.words, aggregate_method = "AVERAGE")

    h2o.predict(gbm, title.vec)
}

# ---- 1. Tokenize every job title ----
print("Break job titles into sequence of words")
words <- tokenize(job.titles$jobtitle)

# ---- 2. Train word embeddings on the token sequence ----
print("Build word2vec model")
w2v.model <- h2o.word2vec(
    words,
    sent_sample_rate = 0,
    epochs = 10
)

print("Sanity check - find synonyms for the word 'teacher'")
print(h2o.findSynonyms(w2v.model, "teacher", count = 5))

# ---- 3. One averaged vector per job title ----
print("Calculate a vector for each job title")
job.title.vecs <- h2o.transform(
    w2v.model,
    words,
    aggregate_method = "AVERAGE"
)

# ---- 4. Assemble the modelling frame ----
# Rows whose first vector component is NA are excluded from both the
# category column and the vectors before they are bound together.
print("Prepare training&validation data (keep only job titles made of known words)")
valid.job.titles <- ! is.na(job.title.vecs$C1)
data <- h2o.cbind(
    job.titles[valid.job.titles, "category"],
    job.title.vecs[valid.job.titles, ]
)
data.split <- h2o.splitFrame(data, ratios = 0.8)

# ---- 5. Fit the classifier on the first split, validate on the second ----
print("Build a basic GBM model")
gbm.model <- h2o.gbm(
    x = names(job.title.vecs),
    y = "category",
    training_frame = data.split[[1]],
    validation_frame = data.split[[2]]
)

# ---- 6. Try the model on a few hand-written titles ----
print("Predict!")
example.titles <- c(
    "school teacher having holidays every month",
    "developer with 3+ Java experience, jumping",
    "Financial accountant CPA preferred"
)
for (example.title in example.titles) {
    print(.predict(example.title, w2v.model, gbm.model))
}
